Null Model of Graphs

Preliminaries

Load in the packages we need…

library(igraph)
library(ggraph)
library(cowplot)
library(tidyverse)

# create a custom function we will use to plot degree distributions...
plot_dd <- function(g) {
  d <- tibble(
  degree = seq(0, length(degree_distribution(g))-1, by = 1),
  rel_freq = degree_distribution(g)
)
  ggplot(d, aes(x= degree, y = rel_freq)) +
    geom_bar(stat = "identity") +
    xlab("Degree") +
    ylab("Relative Frequency") +
    scale_x_continuous(breaks = seq(0, length(degree_distribution(g))-1, by = 1)) +
    ggtitle(paste0( vcount(g), " nodes, ", ecount(g), " edges\nDeg Dist: (mean deg: ", round(mean(degree(g)),2), ", var deg: ", round(var(degree(g)), 2), ")\nGlobal Transitivity (Clustering Coeff): ", round(transitivity(g, type = "global"), 4), "\nDiameter: ", diameter(g), "\nMean Distance: ", round(mean_distance(g), 4), "\nEdge Density: ", round(edge_density(g),4)))
}

Erdos-Renyi Random Networks

Erdos-Reyni simple random graphs can be created by specifying a number of nodes and either the total number of edges in the graph OR a probability of any given pair of nodes being connected.

# specify the number of nodes, n, and the total number of edges in the graph, m
g <-sample_gnm(n = 12, m = 12) # run this several times
plot(g)

# or

g <-sample_gnm(n = 12, m = 12) %>% plot()

g <- sample_gnm(n = 12, m = 66) %>% plot()

NOTE: A graph with 12 nodes can have up to (12 x 11)/2 = 66 total edges, thus the line of code above results in a graph with the 100% of dyads connected.

# specify the number of nodes and the probability of any two nodes being connected
g <- sample_gnp(n = 12, p = 0.182) %>% plot()

g <- sample_gnp(n = 12, p = 1) %>% plot()

# note that with p = 1, all of dyads are connected

Create and visualize 100 random graphs with the same properties using sample_gnm()

We first use the list() function to create empty lists to hold the simulated graphs and their properties…

g <- list()
deg <- list()
diam <- list()
plot <- list()
trans <- list()

n <- 12
m <- 12
for (i in 1:100){
  g[[i]] <- sample_gnm(n = n, m = m)
  diam[[i]] <- diameter(g[[i]])
  deg[[i]] <- mean(degree(g[[i]]))
  plot[[i]] <- ggraph(g[[i]], layout = layout_in_circle(g[[i]])) +
    geom_edge_link() +
    geom_node_point(size = 1, shape=21, fill="#A70042") +
    theme(legend.position = "none")
  trans[[i]] <- transitivity(g[[i]], type = "global")
  }

# draw a histogram of the distribution of diameters of the 100 graphs
hist(
  unlist(diam),
  breaks = seq(from = -0.5, to = 11.5, by = 1))

# calculate the expected mean degree across the 100 graphs
(exp_mean_deg <- (m/(n * (n-1)/2))) * (n - 1)
[1] 2
# calculate the observed mean degree across the 100 graphs
(calc_mean_deg <- mean(unlist(deg)))
[1] 2
# calculate the global transitivity (clustering coefficient) across the 100 graphs
(calc_mean_trans <- mean(unlist(trans)))
[1] 0.143347
# use the `plot_grid()` function from {cowplot} to plot all graphs in the list "plot"
do.call("plot_grid", plot)

Create and visualize 100 random graphs with the same properties using sample_gnp()

g <- list()
deg <- list()
diam <- list()
plot <- list()
trans <- list()

n <- 12
p <- 0.182
for (i in 1:100){
  g[[i]] <- sample_gnp(n = n, p = p)
  deg[[i]] <- mean(degree(g[[i]]))
  diam[[i]] <- diameter(g[[i]])
  plot[[i]] <- ggraph(g[[i]], layout = layout_in_circle(g[[i]])) +
    geom_edge_link() +
    geom_node_point(size = 1, shape=21, fill="#A70042") +
    theme(legend.position = "none")
  trans[[i]] <- transitivity(g[[i]], type = "global")
}

hist(
  unlist(diam),
  breaks = seq(from = -0.5, to = 11.5, by = 1))

(exp_mean_deg <- p * (n - 1))
[1] 2.002
(calc_mean_deg <- mean(unlist(deg)))
[1] 2.003333
(calc_mean_trans <- mean(unlist(trans)))
[1] 0.1375247
do.call("plot_grid", plot)

Important Point 1

For a large random graph, the degree distribution is Poisson, i.e., it is centered on the average degree and the variance is around the same as the average degree…

n <- 5000 # set a large number of nodes
average_degee <- 15 # try alternatives, e.g., 5, 25, 40
g <- sample_gnp(n = n, p = average_degee/(n - 1))

# use a custom function to plot the degree distribution
plot_dd(g)

Important Point 2

Random graphs become nearly fully connected even at low average degree…

n <- 5000 # graph size... try 500, 1000, 2000, 10000
plot <- list() # list to hold plots for each average degree
prop <- list() # list to hold proportion of nodes in largest component
for (i in 1:12) { # i holds average degree...
  g <- sample_gnp(n = n, p = i/(n - 1))
  d <- tibble(
    degree = (1:length(degree_distribution(g))-1),
    freq = degree_distribution(g))
  plot[[i]] <- ggplot(d, aes(x = degree, y = freq)) +
    geom_line() +
    ggtitle(paste0(n, " nodes\nmean deg ", i))
  prop[[i]] <- components(g)$csize[[1]]/n # proportion of nodes in largest component
  }

do.call("plot_grid", plot)

d <- tibble(
  average_degree = c(1:12),
  proportion = unlist(prop))

ggplot(d, aes(x = average_degree, y = proportion)) +
  geom_line() +
  xlab("Average Degree") +
  ylab("Proportion of Nodes\nin Largest Component") +
  scale_x_continuous(breaks = c(1:12))

Important Point 3

Connected components of random graphs are compact, i.e., the diameter of the largest component is small even for large networks…

n <- rep(c(100, 500, 1000, 2500, 5000, 7500, 10000, 15000, 20000, 30000), each = 5)
average_degree <- 12 # try different average degrees
diam <- list() # list to hold diameter of largest component
size <- list() # list to hold size of graph
prop <- list()

for (i in 1:length(n)) { # i network size...
  g <- sample_gnp(n = n[i], p = average_degree/(n[i] - 1))
  size[[i]] <- n[i]
  diam[[i]] <- diameter(g) # this command takes some time to run on large graphs!
  prop[[i]] <- components(g)$csize[[1]]/n[i] # proportion of nodes in largest component
  }

d <- tibble(
  n = unlist(size),
  diam = unlist(diam),
  prop = unlist(prop))

ggplot(d, aes(x = n, y = diam, alpha = 0.1)) +
  geom_jitter(height = 0.1) +
  xlab("Number of Nodes") +
  scale_y_continuous(breaks = seq(0, max(d$diam) + 1, by = 1)) +
  ylab("Diameter of Largest Component") +
  theme(legend.position = "none")

Small World Random Graphs

The small-world model of Watts and Strogatz begins with a ring of nodes where each node is connected to a specified number of neigbors on each side. This graph is a regular lattice. Then, each edge is rewired with probability p (i.e., with probability p an edge is dropped and reconnected between two random nodes).

g <- sample_smallworld(dim=1, size=30, nei=1, p=0)
plot(g, layout = layout_with_kk(g))

plot_dd(g)

g <- sample_smallworld(dim=1, size=30, nei=2, p=0)
plot(g, layout = layout_with_kk(g))

plot_dd(g)

g <- sample_smallworld(dim=1, size=30, nei=4, p=0)
plot(g, layout = layout_with_kk(g))

plot_dd(g)

g <- sample_smallworld(dim=1, size=30, nei=4, p=0.05)
plot(g, layout = layout_with_kk(g))

plot_dd(g)

g <- sample_smallworld(dim=1, size=30, nei=4, p=0.1)
plot(g, layout = layout_in_circle(g))

plot_dd(g)

g <- sample_smallworld(dim=1, size=30, nei=4, p=1)
plot(g, layout = layout_in_circle(g))

plot_dd(g)

g <- sample_smallworld(dim=1, size=100, nei=4, p=0)
plot(g, layout = layout_in_circle(g))

plot_dd(g)

g <- sample_smallworld(dim=1, size=100, nei=4, p=0.1)
plot(g, layout = layout_in_circle(g))

plot_dd(g)

g <- sample_smallworld(dim=1, size=100, nei=4, p=0.2)
plot(g, layout = layout_in_circle(g))

plot_dd(g)

When the rewiring probability is 1, random graphs created from the small world algorithm are very nearly equivalent to Erdos-Reyni random graphs…

g <- sample_smallworld(dim=1, size=100, nei=4, p=1) # this has a mean degree of 8
plot(g, layout = layout_in_circle(g))

plot_dd(g)

g <- sample_gnm(n = 100, m = 400) # corresponds to mean degree of 8 = m / ((n * (n -1))/2) * (n - 1)
plot(g, layout = layout_in_circle(g))

plot_dd(g)

g <- sample_gnp(n = 100, p = 0.0808) # corresponds to mean degree of 8 = p * (n -1)
plot(g, layout = layout_in_circle(g))

plot_dd(g)

Important Point 1

With appropriate rewiring probabilities, small world random graphs can have more realistic (higher) measures of transitivity or clustering (i.e., that are more like those of real world graphs) than random graphs with similar numbers of vertices and edges. For example, compare the transitivities across the following…

# Harry Potter graph
hp <- read_csv("data/hp.csv", col_names = TRUE)
Rows: 222 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (4): from, to, from_name, to_name

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hp <- graph_from_data_frame(hp, directed = FALSE)
plot(hp)

plot_dd(hp)

# similar small world graph
sw <- sample_smallworld(dim = 1, size = 64, nei = 4, p = 1) # try tweaking p... when p = 1, graph resembles random graph
plot(sw)

plot_dd(sw)

# similar random graphs
rg1 <- sample_gnp(n = 64, p = 222/((64 * 63)/2))
plot(rg1)

plot_dd(rg1)

rg2 <- sample_gnm(n = 64, m = 222)
plot(rg2)

plot_dd(rg2)

Note that the transitivity of the small-world graph is greater and more similar to that of the “real world” Harry Potter graph (in contrast to those of the Erdos-Reyni random graphs).

Also note, though, that the degree distributions and edge densities of both the small world and random graphs are poor matches to that of the “real world” graph… the degree distributions are unimodal rather than right-skewed, have much lower variance (lower, even, in the small-world graphs compared to random graphs), and lack a long tail and the edge densities are much higher.

The mean distance and diameter of the small world (and even the random graphs) is also shorter than in the original Harry Potter network. This is an example of the “small-world phenomenon”, where nodes are linked through a short chain of other nodes, even in very large graphs.

In the example below, note that at very low values of the rewiring probability, the diameter and mean distance in even very large “small world” graphs are low and approach those of random graphs, while the clustering coefficient is much higher.

rg <- sample_gnp(2000, p = 0.02)
plot_dd(rg)

dist <- list()
diam <- list()
clust <- list()
for (i in 0:100){
  sw <- sample_smallworld(dim = 1, size = 2000, nei = 20, p = i/100)
  diam[[i + 1]] <- diameter(sw)
  dist[[i + 1]] <- round(mean_distance(sw), 4)
  clust[[i + 1]] <- round(transitivity(sw, type = "global"), 4)
}
d <- tibble(p=seq(0, 1, by = 0.01), dist = unlist(dist), diam = unlist(diam), clust = unlist(clust))
ggplot(data = d, aes(x=p, y = diam)) + geom_point()

ggplot(data = d, aes(x=p, y = dist)) + geom_point()

ggplot(data = d, aes(x=p, y = clust)) + geom_point()

sw <- sample_smallworld(dim = 1, size = 2000, nei = 20, p = 0.05)
plot_dd(sw)

Scale-Free Random Graphs

Many real-world graphs (like the Harry Potter example above) have degree distributions where the average and modal degree are low and the distribution is right-skewed and long-tailed. These distributions generally follow a “power-law” where the frequency of nodes with a particular degree decays exponentially as degree increases. These are also known as “scale-free” networks. One random model of network formation that can produce these kinds of degree distributions is called “preferential attachment”, where as new nodes are added into a network, they are preferentially connected to existing nodes in relation to how many connections they already have. In {igraph}, the sample_pa() function can be used to create null graphs using a tuneable preferential attachment algorithm. Note that, with default conditions, preferential attachment will only create networks that are fully connected (no isolated vertices).

For example, consider the following graphs, each with 100 edges and a mean degree of ~2…

g <- sample_pa(n = 100, power = 0.5, directed = FALSE)
V(g)$color <- "lightblue"
V(g)[degree(g) > 5]$color <- "red"
plot.igraph(g, vertex.size = 5, vertex.label = NA)

plot_dd(g)

g <- sample_pa(n = 100, power = 1.5, directed = FALSE)
V(g)$color <- "lightblue"
V(g)[degree(g) > 5]$color <- "red"
plot.igraph(g, vertex.size = 5, vertex.label = NA)

plot_dd(g)

g <- sample_pa(n = 100, power = 2, directed = FALSE)
V(g)$color <- "lightblue"
V(g)[degree(g) > 5]$color <- "red"
plot.igraph(g, vertex.size = 5, vertex.label = NA)

plot_dd(g)

g <- sample_pa(n = 100, power = 3, directed = FALSE)
V(g)$color <- "lightblue"
V(g)[degree(g) > 5]$color <- "red"
plot.igraph(g, vertex.size = 5, vertex.label = NA)

plot_dd(g)

Using the “out.dist=” argument, we can make somewhat more “realistic” looking networks…

g <- sample_pa(n = 64, power = 1, directed = FALSE, out.dist = c(0.1, 0.257, 0.25, 0.125, 0.1, 0.05, 0.05, 0.025, 0.025))
V(g)$color <- "lightblue"
V(g)[degree(g) > 5]$color <- "red"
plot.igraph(g, vertex.size = 5, vertex.label = NA)

plot_dd(g)

# compare to...
V(hp)$color <- "lightblue"
V(hp)[degree(hp) > 5]$color <- "red"
plot.igraph(hp, vertex.size = 5, vertex.label = NA)

plot_dd(hp)

Important Point 1

What makes a “scale free” network is that the degree distribution roughly follows a power-law. If that is the case, then the datapoints for the tail should fall rougly along a horizontal line (when plotted on an untransformed scale) and along a straight line (when plotted on a log-log scale).

g <- sample_pa(n = 10000, power = 1, directed = FALSE)
V(g)$color <- "lightblue"
V(g)[degree(g) > 5]$color <- "red"
plot.igraph(g, vertex.size = 5, vertex.label = NA)

plot_dd(g)

plot(degree.distribution(g),log='xy',
xlab = "Degree", ylab = "Relative Frequency")
Warning: `degree.distribution()` was deprecated in igraph 2.0.0.
ℹ Please use `degree_distribution()` instead.
Warning in xy.coords(x, y, xlabel, ylabel, log): 43 y values <= 0 omitted from
logarithmic plot

Our Harry Potter network does not follow this exactly, but it is pretty close…

plot(degree.distribution(hp),log='xy',
xlab = "Degree", ylab = "Relative Frequency")
Warning in xy.coords(x, y, xlabel, ylabel, log): 13 y values <= 0 omitted from
logarithmic plot